data/transcript_counts and data/objects folders respectively.annotation folder.library(scater)
library(Seurat)
library(tibble)
library(dplyr)
library(GenomicFeatures)
library(org.Dr.eg.db)
library(Danio.rerio.Zv9.77.gtf)
library(ggplot2)
library(patchwork)
Analysis performed using Seurat (v3.1.1 and uwot_0.1.5, largely based on guidelines described here[https://satijalab.org/seurat/v3.2/pbmc3k_tutorial.html]).
plate.wells.384 <- paste(rep(LETTERS, each=24, len=384), rep(1:24, len=384), sep="")
# Plate 2
plate.2.transcript.counts <-
as.data.frame(
read.delim(
"data/transcript_counts/PN2_TranscriptCounts.tsv",
sep = "\t",
stringsAsFactors = F
)
)
rownames(plate.2.transcript.counts) <- plate.2.transcript.counts$GENEID
colnames(plate.2.transcript.counts)[-1] <- paste("Pn2_", plate.wells.384, sep = "")
# Plate 3
plate.3.transcript.counts <-
as.data.frame(
read.delim(
"data/transcript_counts/PN3_TranscriptCounts.tsv",
sep = "\t",
stringsAsFactors = F
)
)
rownames(plate.3.transcript.counts) <- plate.3.transcript.counts$GENEID
colnames(plate.3.transcript.counts)[-1] <- paste("Pn3_", plate.wells.384, sep = "")
# Plate 4
plate.4.transcript.counts <-
as.data.frame(
read.delim(
"data/transcript_counts/PN4_TranscriptCounts.tsv",
sep = "\t",
stringsAsFactors = F
)
)
rownames(plate.4.transcript.counts) <- plate.4.transcript.counts$GENEID
colnames(plate.4.transcript.counts)[-1] <- paste("Pn4_", plate.wells.384, sep = "")
# Plate 5
plate.5.transcript.counts <-
as.data.frame(
read.delim(
"data/transcript_counts/PN5_TranscriptCounts.tsv",
sep = "\t",
stringsAsFactors = F
)
)
colnames(plate.5.transcript.counts)[-1] <-
paste("Pn5_", plate.wells.384, sep = "")
rownames(plate.5.transcript.counts) <-
plate.5.transcript.counts$GENEID
# Plate 7
plate.7.transcript.counts <-
as.data.frame(
read.delim(
"data/transcript_counts/PN7_TranscriptCounts.tsv",
sep = "\t",
stringsAsFactors = F
)
)
colnames(plate.7.transcript.counts)[-1] <-
paste("Pn7_", plate.wells.384, sep = "")
rownames(plate.7.transcript.counts) <-
plate.7.transcript.counts$GENEID
# Plate 8
plate.8.transcript.counts <-
as.data.frame(
read.delim(
"data/transcript_counts/PN8_TranscriptCounts.tsv",
sep = "\t",
stringsAsFactors = F
)
)
colnames(plate.8.transcript.counts)[-1] <-
paste("Pn8_", plate.wells.384, sep = "")
rownames(plate.8.transcript.counts) <-
plate.8.transcript.counts$GENEID
all.plates.transcript.counts <- dplyr::full_join(plate.2.transcript.counts, plate.3.transcript.counts, by = "GENEID") %>%
full_join(., plate.4.transcript.counts, by = "GENEID") %>%
full_join(., plate.5.transcript.counts, by = "GENEID") %>%
full_join(., plate.7.transcript.counts, by = "GENEID") %>%
full_join(., plate.8.transcript.counts, by = "GENEID")
# Substitute transcript counts NAs for 0s.
all.plates.transcript.counts[is.na(all.plates.transcript.counts)] <- 0
rownames(all.plates.transcript.counts) <- all.plates.transcript.counts$GENEID
## Discard spike-in ERCC controls (not used for downstream analysis).
all.plates.transcript.counts.no.ERCCs <- all.plates.transcript.counts[-grep("^ERCC", rownames(all.plates.transcript.counts)), ]
## Create counts matrix.
all.plates.transcript.counts.no.ERCCs.mx <- as.matrix(all.plates.transcript.counts.no.ERCCs[, -1])
Please install package Danio.rerio.Zv9.77.gtf located in the project’s annotation folder. Code used to create this package described in annotation/TxDb.Drerio.Ensembl.gtf.git.Zv9.R.
gene.symbols <-
mapIds(
org.Dr.eg.db,
keys = all.plates.transcript.counts.no.ERCCs$GENEID,
keytype = "ENSEMBL",
column = "SYMBOL"
) %>% as.data.frame() %>% rownames_to_column(var = 'gene')
colnames(gene.symbols)[2] <- "symbol"
gene.locations <-
mapIds(
Danio.rerio.Zv9.77.gtf,
keys = all.plates.transcript.counts.no.ERCCs$GENEID,
column = "CDSCHROM",
keytype = "GENEID"
)
mito.genes <- gene.locations == "MT" & !is.na(gene.locations)
mito.genes <- mito.genes[which(mito.genes)] %>% names()
scs <-
CreateSeuratObject(
counts = all.plates.transcript.counts.no.ERCCs.mx,
min.cells = 25,
min.features = 250,
project = "Macrophages_scRNA-seq_Zebrafish"
)
metadata.df <- scs@meta.data
metadata.df <-
metadata.df %>% rownames_to_column(var = "well_id") %>%
dplyr::mutate(Injury = ifelse(
orig.ident == "Pn8",
"Uninjured",
ifelse(
orig.ident == "Pn2" | orig.ident == "Pn5",
"1 DPI",
ifelse(orig.ident == "Pn3", "2 DPI", "3 DPI")
)
))
injury.type <- metadata.df$Injury
names(injury.type) <- metadata.df$well_id
scs <-
AddMetaData(object = scs,
metadata = injury.type,
col.name = "injury")
scs@meta.data$injury <-
factor(scs@meta.data$injury,
levels = c("Uninjured", "1 DPI", "2 DPI", "3 DPI"))
## Add percentage of mitochondrial genes reads to metadata.
scs[["percent.mt"]] <-
PercentageFeatureSet(scs, features = mito.genes)
# Assess UMIs, features and percentage of mitochondrial reads distribution.
VlnPlot(
object = scs,
features = c("nCount_RNA", "nFeature_RNA", "percent.mt"),
group.by = "orig.ident"
)
# Plot Pn8 separately (Pn8 outliers compress y-axis scale which makes it difficult to make a clear assessment)for better visualization
VlnPlot(
object = subset(scs, subset = orig.ident != "Pn8"),
features = c("nCount_RNA", "nFeature_RNA", "percent.mt"),
group.by = "orig.ident"
)
VlnPlot(
object = subset(scs, subset = orig.ident == "Pn8"),
features = c("nCount_RNA", "nFeature_RNA", "percent.mt"),
group.by = "orig.ident"
)
# Assess UMIs to features relationship.
FeatureScatter( object = subset(scs, subset = orig.ident != "Pn8"),
feature1 = "nCount_RNA",
feature2 = "nFeature_RNA")
FeatureScatter( object = subset(scs, subset = orig.ident == "Pn8"),
feature1 = "nCount_RNA",
feature2 = "nFeature_RNA")
Based on the quality assessment from above, we filtered cells using the parameters below (global thresholds, i.e, non plate-specific, applied due to quality heterogenity of plates).
scs.filter <-
subset(
scs,
subset = percent.mt < 10 &
nFeature_RNA > 500 &
nFeature_RNA < 3500 & nCount_RNA > 1000 & nCount_RNA < 15000
)
# UMIs to features relationship post filtering.
VlnPlot(
object = scs.filter,
features = c("nCount_RNA", "nFeature_RNA", "percent.mt"),
group.by = "orig.ident"
)
FeatureScatter(object = scs.filter,
feature1 = "nCount_RNA",
feature2 = "nFeature_RNA")
scs.sce <- as.SingleCellExperiment(scs.filter, assay = "RNA")
features.QC <- perFeatureQCMetrics(scs.sce)
rownames(features.QC) <- rownames(scs.sce)
features.QC$gene_id <- rownames(features.QC)
top.500.most.expressed <- dplyr::arrange(features.QC %>% as.data.frame(), desc(mean)) %>% head(n = 500)
colnames(top.500.most.expressed)[3] <- "gene"
top.500.most.expressed.annotated <- left_join(top.500.most.expressed, gene.symbols, by = "gene")
scs.filter <-
SCTransform(scs.filter,
vars.to.regress = "percent.mt",
verbose = TRUE)
scs.filter <- RunPCA(scs.filter, verbose = FALSE, assay = 'SCT')
DimPlot(scs.filter, label = FALSE, group.by = 'injury')
ElbowPlot(scs.filter, ndims = 50)
scs.filter <-
RunUMAP(scs.filter,
dims = 1:50,
verbose = TRUE,
assay = 'SCT')
DimPlot(scs.filter, group.by = "injury")
scs.filter <-
FindNeighbors(scs.filter,
dims = 1:50,
verbose = TRUE,
assay = 'SCT')
scs.filter <-
FindClusters(
scs.filter,
verbose = TRUE,
resolution = 0.5,
assay = 'SCT'
)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 1309
## Number of edges: 51738
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8239
## Number of communities: 8
## Elapsed time: 0 seconds
DimPlot(scs.filter, label = FALSE, group.by = "ident") #+ NoLegend()
# Cluster by injury table
cluster.frequency.table <-
scs.filter@meta.data %>%
dplyr::count(seurat_clusters, injury) %>%
dplyr::group_by(seurat_clusters) %>%
dplyr::mutate(freq = n / sum(n)) %>%
ungroup()
knitr::kable(caption = "Table of cluster by injury counts and frequencies",
col.names = c("Cluster", "Injury", "n", "Freq."),
cluster.frequency.table %>% arrange(seurat_clusters)
)
| Cluster | Injury | n | Freq. |
|---|---|---|---|
| 0 | Uninjured | 2 | 0.0053908 |
| 0 | 1 DPI | 31 | 0.0835580 |
| 0 | 2 DPI | 116 | 0.3126685 |
| 0 | 3 DPI | 222 | 0.5983827 |
| 1 | Uninjured | 1 | 0.0028736 |
| 1 | 1 DPI | 315 | 0.9051724 |
| 1 | 2 DPI | 19 | 0.0545977 |
| 1 | 3 DPI | 13 | 0.0373563 |
| 2 | Uninjured | 3 | 0.0135747 |
| 2 | 1 DPI | 40 | 0.1809955 |
| 2 | 2 DPI | 59 | 0.2669683 |
| 2 | 3 DPI | 119 | 0.5384615 |
| 3 | Uninjured | 169 | 0.9285714 |
| 3 | 3 DPI | 13 | 0.0714286 |
| 4 | Uninjured | 12 | 0.1739130 |
| 4 | 1 DPI | 3 | 0.0434783 |
| 4 | 2 DPI | 10 | 0.1449275 |
| 4 | 3 DPI | 44 | 0.6376812 |
| 5 | Uninjured | 2 | 0.0408163 |
| 5 | 1 DPI | 5 | 0.1020408 |
| 5 | 2 DPI | 11 | 0.2244898 |
| 5 | 3 DPI | 31 | 0.6326531 |
| 6 | Uninjured | 8 | 0.1666667 |
| 6 | 1 DPI | 5 | 0.1041667 |
| 6 | 2 DPI | 2 | 0.0416667 |
| 6 | 3 DPI | 33 | 0.6875000 |
| 7 | Uninjured | 4 | 0.1904762 |
| 7 | 1 DPI | 1 | 0.0476190 |
| 7 | 2 DPI | 9 | 0.4285714 |
| 7 | 3 DPI | 7 | 0.3333333 |
Use for cluster marker detection, feature plots and trajectory analysis.
mmp9.vlnplot.1 <-
VlnPlot(scs.filter, features = "ENSDARG00000042816") + NoLegend() + xlab("Cluster") + ggtitle(NULL)
mmp9.vlnplot.2 <-
VlnPlot(scs.filter, features = "ENSDARG00000042816", group.by = 'injury') + NoLegend() + xlab("Time point post injury") + ggtitle(NULL)
mmp9.patchwork <- mmp9.vlnplot.1 + mmp9.vlnplot.2
mmp9.patchwork + plot_annotation(title = "mmp9 expression levels (logn counts)",
theme = theme(plot.title = element_text(face = "bold", size = 16)))
nampta.vlnplot.1 <-
VlnPlot(scs.filter, features = "ENSDARG00000030598") + NoLegend() + xlab("Cluster") + ggtitle(NULL)
nampta.vlnplot.2 <-
VlnPlot(scs.filter, features = "ENSDARG00000030598", group.by = 'injury') + NoLegend() + xlab("Time point post injury") + ggtitle(NULL)
nampta.patchwork <- nampta.vlnplot.1 + nampta.vlnplot.2
nampta.patchwork + plot_annotation(title = "nampta expression levels (logn counts)",
theme = theme(plot.title = element_text(face = "bold", size = 16)))
clusters.markers <- FindAllMarkers(scs.filter, min.pct = 0.2, assay = 'RNA', test.use = "wilcox")
de.clusters.markers <- clusters.markers %>% dplyr::filter(p_val_adj < 0.05)
Please note that the percentage values described in Figure 3.e plots were obtained from the cluster.markers dataframe, for cluster specific gene expression, or top.500.most.expressed dataframe, when global gene expression was described.
selected.genes <- c("lcp1", "cd63", "arg2", "mmp13a", "mmp9", "nampta")
# Examples of how to obtain percentage of cells expressing cells selected markers.
# dplyr::filter(top.500.most.expressed, gene %in% dplyr::filter(gene.symbols, symbol %in% selected.genes)$gene)
# dplyr::filter(clusters.markers, gene %in% dplyr::filter(gene.symbols, symbol %in% selected.genes)$gene, cluster == "2")
selected.genes.annotated <- dplyr::filter(gene.symbols, symbol %in% selected.genes)
selected.genes.annotated <-
selected.genes.annotated %>% arrange(factor(symbol,
levels = selected.genes))
markers.set.plot <-
FeaturePlot(
object = scs.filter,
features = selected.genes.annotated$gene,
cols = c("lightblue", "darkred"),
pt.size = 2,
combine = FALSE
)
names(markers.set.plot) <- selected.genes.annotated$symbol
markers.set.plot.l <- lapply(names(markers.set.plot), function(x) markers.set.plot[[x]] + labs(title = x))
cowplot::plot_grid(plotlist = markers.set.plot.l)
antigen.processing.presenting.genes <- c("cd83", "cd81a", "cd40","cd74b", "cd9b", "cd164", "cd276", "cd99", "cd82b", "cd82a", "cd99l2", "cd74a")
# Examples of how to obtain percentage of cells expressing antigen processing and presentation genes.
# dplyr::filter(top.500.most.expressed, gene %in% dplyr::filter(gene.symbols, symbol %in% antigen.processing.presenting.genes)$gene)
# dplyr::filter(clusters.markers, gene %in% dplyr::filter(gene.symbols, symbol %in% antigen.processing.presenting.genes)$gene, cluster == "1")
antigen.processing.presenting.genes.annotated <-
dplyr::filter(gene.symbols, symbol %in% antigen.processing.presenting.genes)
antigen.processing.presenting.genes.annotated <-
antigen.processing.presenting.genes.annotated %>% arrange(factor(
symbol,
levels = antigen.processing.presenting.genes)
)
antigen.processing.presenting.genes.plot <-
FeaturePlot(
object = scs.filter,
features = antigen.processing.presenting.genes.annotated$gene,
cols = c("lightblue", "darkred"),
pt.size = 2,
combine = FALSE
)
names(antigen.processing.presenting.genes.plot) <- antigen.processing.presenting.genes.annotated$symbol
antigen.processing.presenting.genes.plot.l <- lapply(names(antigen.processing.presenting.genes.plot), function(x) antigen.processing.presenting.genes.plot[[x]] + labs(title = x))
cowplot::plot_grid(plotlist = antigen.processing.presenting.genes.plot.l)
# pou2f3 == ENSDARG00000052387
VlnPlot(scs.filter, features = "ENSDARG00000052387") + NoLegend() + xlab("Cluster") + ggtitle("pou2f3")
VlnPlot(scs.filter, features = "ENSDARG00000055158") + NoLegend() + xlab("Cluster") + ggtitle("prox1a")
See Trajectory_Analysis.ipynb
save(scs.filter, file = "data/objects/scs_filter.Rdata")
sessionInfo()
## R version 3.6.2 (2019-12-12)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.4 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
##
## locale:
## [1] LC_CTYPE=en_AU.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_AU.UTF-8 LC_COLLATE=en_AU.UTF-8
## [5] LC_MONETARY=en_AU.UTF-8 LC_MESSAGES=en_AU.UTF-8
## [7] LC_PAPER=en_AU.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] patchwork_1.1.0 Danio.rerio.Zv9.77.gtf_1.0.0
## [3] org.Dr.eg.db_3.10.0 GenomicFeatures_1.38.2
## [5] AnnotationDbi_1.48.0 dplyr_1.0.2
## [7] tibble_3.0.3 Seurat_3.1.1
## [9] scater_1.14.6 ggplot2_3.3.2
## [11] SingleCellExperiment_1.8.0 SummarizedExperiment_1.16.1
## [13] DelayedArray_0.12.3 BiocParallel_1.20.1
## [15] matrixStats_0.56.0 Biobase_2.46.0
## [17] GenomicRanges_1.38.0 GenomeInfoDb_1.22.1
## [19] IRanges_2.20.2 S4Vectors_0.24.4
## [21] BiocGenerics_0.32.0
##
## loaded via a namespace (and not attached):
## [1] BiocFileCache_1.10.2 sn_1.6-2 plyr_1.8.6
## [4] igraph_1.2.5 lazyeval_0.2.2 splines_3.6.2
## [7] listenv_0.8.0 TH.data_1.0-10 digest_0.6.25
## [10] htmltools_0.5.0 viridis_0.5.1 magrittr_1.5
## [13] memoise_1.1.0 cluster_2.1.0 ROCR_1.0-11
## [16] globals_0.12.5 Biostrings_2.54.0 RcppParallel_5.0.2
## [19] R.utils_2.9.2 sandwich_2.5-1 askpass_1.1
## [22] prettyunits_1.1.1 colorspace_1.4-1 blob_1.2.1
## [25] rappdirs_0.3.1 ggrepel_0.8.2 xfun_0.16
## [28] crayon_1.3.4 RCurl_1.98-1.2 jsonlite_1.7.0
## [31] survival_3.1-8 zoo_1.8-8 ape_5.4-1
## [34] glue_1.4.1 gtable_0.3.0 zlibbioc_1.32.0
## [37] XVector_0.26.0 leiden_0.3.3 BiocSingular_1.2.2
## [40] future.apply_1.6.0 scales_1.1.1 mvtnorm_1.1-1
## [43] DBI_1.1.0 bibtex_0.4.2.2 Rcpp_1.0.5
## [46] metap_1.4 plotrix_3.7-8 progress_1.2.2
## [49] viridisLite_0.3.0 reticulate_1.16 bit_4.0.4
## [52] rsvd_1.0.3 SDMTools_1.1-221 tsne_0.1-3
## [55] htmlwidgets_1.5.1 httr_1.4.2 RColorBrewer_1.1-2
## [58] TFisher_0.2.0 ellipsis_0.3.1 ica_1.0-2
## [61] farver_2.0.3 XML_3.99-0.3 pkgconfig_2.0.3
## [64] R.methodsS3_1.8.0 dbplyr_2.0.0 uwot_0.1.5
## [67] labeling_0.3 tidyselect_1.1.0 rlang_0.4.7
## [70] reshape2_1.4.4 munsell_0.5.0 tools_3.6.2
## [73] generics_0.0.2 RSQLite_2.2.0 mathjaxr_1.0-1
## [76] ggridges_0.5.2 evaluate_0.14 stringr_1.4.0
## [79] yaml_2.2.1 knitr_1.29 bit64_4.0.2
## [82] fitdistrplus_1.1-1 purrr_0.3.4 RANN_2.6.1
## [85] pbapply_1.4-3 future_1.18.0 nlme_3.1-143
## [88] R.oo_1.23.0 biomaRt_2.42.1 compiler_3.6.2
## [91] curl_4.3 beeswarm_0.2.3 plotly_4.9.2.1
## [94] png_0.1-7 stringi_1.4.6 highr_0.8
## [97] RSpectra_0.16-0 lattice_0.20-38 Matrix_1.2-18
## [100] multtest_2.42.0 vctrs_0.3.2 mutoss_0.1-12
## [103] pillar_1.4.6 lifecycle_0.2.0 Rdpack_1.0.0
## [106] lmtest_0.9-37 RcppAnnoy_0.0.16 BiocNeighbors_1.4.2
## [109] data.table_1.13.0 cowplot_1.0.0 bitops_1.0-6
## [112] irlba_2.3.3 gbRd_0.4-11 rtracklayer_1.46.0
## [115] R6_2.4.1 KernSmooth_2.23-16 gridExtra_2.3
## [118] vipor_0.4.5 codetools_0.2-16 assertthat_0.2.1
## [121] MASS_7.3-51.5 openssl_1.4.2 withr_2.2.0
## [124] GenomicAlignments_1.22.1 Rsamtools_2.2.3 sctransform_0.2.1
## [127] mnormt_1.5-6 multcomp_1.4-13 GenomeInfoDbData_1.2.2
## [130] hms_0.5.3 grid_3.6.2 tidyr_1.1.1
## [133] rmarkdown_2.5 DelayedMatrixStats_1.8.0 Rtsne_0.15
## [136] numDeriv_2016.8-1.1 ggbeeswarm_0.6.0